1.5. Housing in Brazil 🇧🇷

In [2]:
# Import Matplotlib, pandas, and plotly
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

Task 1.5.1

In [3]:
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.head()
Out[3]:
property_type place_with_parent_names region lat-lon area_m2 price_usd
0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85
1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37
2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45
3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20
4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71

Task 1.5.2

In [4]:
df1.dropna(inplace=True)
df1.head()
Out[4]:
property_type place_with_parent_names region lat-lon area_m2 price_usd
0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85
1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37
2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45
3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20
4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71

Task 1.5.3

In [5]:
df1[["lat", "lon"]] = df1["lat-lon"].str.split(",", expand=True)

# Convert them to float
df1["lat"] = df1["lat"].astype(float)
df1["lon"] = df1["lon"].astype(float)

# Check result
df1.head()
Out[5]:
property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon
0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 $187,230.85 -9.644305 -35.708814
1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 $81,133.37 -9.643093 -35.704840
2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 $154,465.45 -9.622703 -35.729795
3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 $146,013.20 -9.622837 -35.719556
4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 $101,416.71 -9.654955 -35.700227

Task 1.5.4

In [6]:
df1["state"] = df1["place_with_parent_names"].str.split("|").str[2]

# Check result
df1[["place_with_parent_names", "state"]].head()
Out[6]:
place_with_parent_names state
0 |Brasil|Alagoas|Maceió| Alagoas
1 |Brasil|Alagoas|Maceió| Alagoas
2 |Brasil|Alagoas|Maceió| Alagoas
3 |Brasil|Alagoas|Maceió| Alagoas
4 |Brasil|Alagoas|Maceió| Alagoas

Task 1.5.5

In [7]:
df1["price_usd"] = (
    df1["price_usd"]
    .astype(str)                            # ensure strings
    .str.replace(r"[$,]", "", regex=True)   # remove $ and ,
    .astype(float)                          # convert to float
)
df1.head()
Out[7]:
property_type place_with_parent_names region lat-lon area_m2 price_usd lat lon state
0 apartment |Brasil|Alagoas|Maceió| Northeast -9.6443051,-35.7088142 110.0 187230.85 -9.644305 -35.708814 Alagoas
1 apartment |Brasil|Alagoas|Maceió| Northeast -9.6430934,-35.70484 65.0 81133.37 -9.643093 -35.704840 Alagoas
2 house |Brasil|Alagoas|Maceió| Northeast -9.6227033,-35.7297953 211.0 154465.45 -9.622703 -35.729795 Alagoas
3 apartment |Brasil|Alagoas|Maceió| Northeast -9.622837,-35.719556 99.0 146013.20 -9.622837 -35.719556 Alagoas
4 apartment |Brasil|Alagoas|Maceió| Northeast -9.654955,-35.700227 55.0 101416.71 -9.654955 -35.700227 Alagoas

Task 1.5.6

In [8]:
df1.drop(columns=["place_with_parent_names"], inplace=True)
df1.head()
Out[8]:
property_type region lat-lon area_m2 price_usd lat lon state
0 apartment Northeast -9.6443051,-35.7088142 110.0 187230.85 -9.644305 -35.708814 Alagoas
1 apartment Northeast -9.6430934,-35.70484 65.0 81133.37 -9.643093 -35.704840 Alagoas
2 house Northeast -9.6227033,-35.7297953 211.0 154465.45 -9.622703 -35.729795 Alagoas
3 apartment Northeast -9.622837,-35.719556 99.0 146013.20 -9.622837 -35.719556 Alagoas
4 apartment Northeast -9.654955,-35.700227 55.0 101416.71 -9.654955 -35.700227 Alagoas

Task 1.5.7

In [9]:
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
df2.head()
Out[9]:
property_type state region lat lon area_m2 price_brl
0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98
1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53
2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28
3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53
4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36

Task 1.5.8

In [10]:
df2["price_usd"] =  (df2["price_brl"] / 3.19).round(2)
df2.head()
Out[10]:
property_type state region lat lon area_m2 price_brl price_usd
0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 414222.98 129850.46
1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 848408.53 265958.79
2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 299438.28 93867.80
3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 848408.53 265958.79
4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 464129.36 145495.10

Task 1.5.9

In [11]:
df2.drop(columns=["price_brl"], inplace=True)
df2.dropna(inplace=True)
df2.head()
Out[11]:
property_type state region lat lon area_m2 price_usd
0 apartment Pernambuco Northeast -8.134204 -34.906326 72.0 129850.46
1 apartment Pernambuco Northeast -8.126664 -34.903924 136.0 265958.79
2 apartment Pernambuco Northeast -8.125550 -34.907601 75.0 93867.80
3 apartment Pernambuco Northeast -8.120249 -34.895920 187.0 265958.79
4 apartment Pernambuco Northeast -8.142666 -34.906906 80.0 145495.10

Task 1.5.10

In [12]:
df = pd.concat([df1, df2])
print("df shape:", df.shape)
df.to_csv("data/brasil-real-estate-clean.csv", index=False)
df.head()
df shape: (22844, 8)
Out[12]:
property_type region lat-lon area_m2 price_usd lat lon state
0 apartment Northeast -9.6443051,-35.7088142 110.0 187230.85 -9.644305 -35.708814 Alagoas
1 apartment Northeast -9.6430934,-35.70484 65.0 81133.37 -9.643093 -35.704840 Alagoas
2 house Northeast -9.6227033,-35.7297953 211.0 154465.45 -9.622703 -35.729795 Alagoas
3 apartment Northeast -9.622837,-35.719556 99.0 146013.20 -9.622837 -35.719556 Alagoas
4 apartment Northeast -9.654955,-35.700227 55.0 101416.71 -9.654955 -35.700227 Alagoas

Explore¶

In [13]:
fig = px.scatter_mapbox(
    df,
    lat="lat", 
    lon="lon", 
    center={"lat": -14.2, "lon": -51.9},  # Map will be centered on Brazil
    width=600,
    height=600,
    hover_data=["price_usd"],  # Display price when hovering mouse over house
)

fig.update_layout(mapbox_style="open-street-map")

fig.show()

Task 1.5.11

In [14]:
summary_stats = df[["area_m2", "price_usd"]].describe()
summary_stats
Out[14]:
area_m2 price_usd
count 22844.000000 22844.000000
mean 115.020224 194987.315515
std 47.742932 103617.682979
min 53.000000 74892.340000
25% 76.000000 113898.770000
50% 103.000000 165697.555000
75% 142.000000 246900.882500
max 252.000000 525659.720000
Slight Code Change

In the following task, you'll notice a small change in how plots are created compared to what you saw in the lessons. While the lessons use the global matplotlib method like plt.plot(...), in this task, you are expected to use the object-oriented (OOP) API instead. This means creating your plots using fig, ax = plt.subplots() and then calling plotting methods on the ax object, such as ax.plot(...), ax.hist(...), or ax.scatter(...).

If you're using pandas’ or seaborn’s built-in plotting methods (like df.plot() or sns.lineplot()), make sure to pass the ax=ax argument so that the plot is rendered on the correct axes.

This approach is considered best practice and will be used consistently across all graded tasks that involve matplotlib.

Task 1.5.12

In [15]:
# Don't change the code below 👇
fig, ax = plt.subplots()
prices = df["price_usd"].head(20000).dropna()
# Build histogram
ax.hist(prices)

# Label axes
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")

# Add title
plt.title("Distribution of Home Prices")
Out[15]:
Text(0.5, 1.0, 'Distribution of Home Prices')
No description has been provided for this image

Task 1.5.13

In [16]:
# Don't change the code below 👇
fig, ax = plt.subplots()

#Build box plot
ax.boxplot(df["area_m2"], vert=False)

# Label x-axis
plt.xlabel("Area [sq meters]")

# Add title
plt.title("Distribution of Home Sizes")
Out[16]:
Text(0.5, 1.0, 'Distribution of Home Sizes')
No description has been provided for this image

Task 1.5.14

In [17]:
mean_price_by_region = df.groupby("region")["price_usd"].mean()

mean_price_by_region = mean_price_by_region.sort_values()
mean_price_by_region 
Out[17]:
region
Central-West    178596.283663
North           181308.958188
Northeast       185422.985482
South           189012.345360
Southeast       208996.762761
Name: price_usd, dtype: float64

Task 1.5.15

In [18]:
# Don't change the code below 👇
fig, ax = plt.subplots()


# Build bar chart, label axes, add title
mean_price_by_region.plot(
    kind='bar',
    xlabel="Region",
    ylabel="Mean Price [USD]",
    title="Mean Home Price by Region"
    , ax=ax)
Out[18]:
<Axes: title={'center': 'Mean Home Price by Region'}, xlabel='Region', ylabel='Mean Price [USD]'>
No description has been provided for this image

Task 1.5.16

In [19]:
df.drop(columns="lat-lon", inplace=True)
df_south = df[df["region"] == "South"]
df_south.head()
Out[19]:
property_type region area_m2 price_usd lat lon state
9304 apartment South 127.0 296448.85 -25.455704 -49.292918 Paraná
9305 apartment South 104.0 219996.25 -25.455704 -49.292918 Paraná
9306 apartment South 100.0 194210.50 -25.460236 -49.293812 Paraná
9307 apartment South 77.0 149252.94 -25.460236 -49.293812 Paraná
9308 apartment South 73.0 144167.75 -25.460236 -49.293812 Paraná

Task 1.5.17

In [20]:
homes_by_state = df_south["state"].value_counts()
homes_by_state
Out[20]:
Rio Grande do Sul    2643
Santa Catarina       2634
Paraná               2544
Name: state, dtype: int64

Task 1.5.18

In [21]:
# Subset data
df_south_rgs = df_south["state"].value_counts().idxmax()

# Don't change the code below 👇
fig, ax = plt.subplots()
df_top_state = df_south[df_south["state"] == df_south_rgs]

# Build scatter plot
ax.scatter(x=df_top_state["area_m2"], y=df_top_state["price_usd"])

# Label axes
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")

# Add title
plt.title(f"{df_south_rgs}: Price vs. Area")
Out[21]:
Text(0.5, 1.0, 'Rio Grande do Sul: Price vs. Area')
No description has been provided for this image

Task 1.5.19

In [22]:
 # First, filter only South region data
df_south = df[df["region"] == "South"]

# Create the dictionary with correlations
south_states_corr = {}

# Loop through the three states
for state in df_south["state"].unique():
    state_data = df_south[df_south["state"] == state]
    corr = state_data["area_m2"].corr(state_data["price_usd"])
    south_states_corr[state] = corr

south_states_corr
Out[22]:
{'Paraná': 0.5436659935502657,
 'Rio Grande do Sul': 0.5773267433871904,
 'Santa Catarina': 0.5068121769989854}

Copyright 2024 WorldQuant University. This content is licensed solely for personal use. Redistribution or publication of this material is strictly prohibited.